library(renv)
renv::load()
renv::restore() # Install missing librairies.
renv::status() # Check the project state.Modelling eel abundance in the Vilaine bassin
0. Setup
Load dependencies.
library(zen4R)
library(here)Download data from Zenodo. The data file is relatively heavy (~800Mb), this operation can take a few minutes.
doi <- "10.5281/zenodo.17962542"
dest_dir <- here("data")if (!dir.exists(dest_dir)) dir.create(dest_dir, recursive = TRUE)
download_zenodo(doi, path = dest_dir)
unzip(here(dest_dir, "miste_data.zip"), exdir = dest_dir)If everything went well, the data should have been downloaded in data/zenodo/.
1. River data
First, we focus on the river geographic data. Let’s load it.
base::load(here(dest_dir, "zenodo", "processed_data", "reseaux_hydrographiques.rda"))For now, we will focus on the Vilaine bassin. We can view it using sfnetworks.
library(sf)
library(sfnetworks)
net <- as_sfnetwork(rht_vilaine)
plot(net, cex = 0.5)We can see that the hydrographic network is not entirely connected. For example, a small portion in the south west is disconnected from the rest of the bassin. We should keep this in mind for later, as it could bias our statistical model.
We want to check that the network data is valid. In particular, we want to verify that there is no duplicate in the nodes, or edges data.
nodes <- net |>
activate("nodes") |>
st_as_sf()
sum(duplicated(nodes))[1] 0
library(dplyr)
library(tidyr)
edges <- net |>
activate("edges") |>
st_as_sf()
sum(duplicated(select(edges, c("from", "to"))))[1] 0
We see that there is no duplication for the Vilaine network. We can proceed to the next step.
2. Environmental data
2.1 Load data
base::load(here(dest_dir, "zenodo", "processed_data", "poissons_env_temp.RData"))
env# A tibble: 5,777 × 14
sta_id pop_id ope_id ope_date annee distance_mer altitude
<int> <int> <int> <dttm> <dbl> <int> <int>
1 9655 37720 22622 1996-09-24 10:09:00 1996 NA 305
2 9225 35420 16755 1998-06-19 09:30:00 1998 NA 80
3 9622 37544 16012 2000-05-22 14:45:00 2000 328 98
4 9762 38357 17091 2006-09-28 14:00:00 2006 216 31
5 9384 36157 17349 2006-06-21 14:30:00 2006 482 140
6 9712 38066 17621 2016-09-08 10:30:00 2016 400 130
7 9454 36580 17379 2010-09-14 15:00:00 2010 500 124
8 9520 36941 17079 2007-06-14 14:30:00 2007 224 37
9 9381 36135 17154 2006-06-20 00:00:00 2006 387 100
10 9284 35724 17596 2016-09-29 10:00:00 2016 212 32
# ℹ 5,767 more rows
# ℹ 7 more variables: surface_bv <dbl>, distance_source <dbl>, largeur <dbl>,
# pente <dbl>, profondeur <dbl>, temp_juillet <dbl>, temp_janvier <dbl>
Let’s rename column for clarity.
env <- env |> rename(year = annee)Station geographical coordinates are stored in points_geo_<river>. We visualize them over the hydrographic network.
plot(net, col = "black", cex = 0.5)
plot(points_geo_vilaine, col = "red", add = TRUE, pch = 16, cex = 0.5)2.2. Visualisation of altitude data
To begin with, let’s focus on a single environment variable: the altitude.
d_env <- env |>
select(c("sta_id", "pop_id", "altitude")) |>
distinct()
d_env |> filter(is.na(altitude))# A tibble: 2 × 3
sta_id pop_id altitude
<int> <int> <int>
1 8853 33379 NA
2 13994 51300 NA
We see that we have missing values for two stations: 8853 and 13994. Next, we want to relate altitude values to geographical coordinates.
pts_vilaine <- points_geo_vilaine |>
select(c("pop_id", "geometry")) |>
left_join(d_env, by = "pop_id") |>
filter(!is.na(altitude))
pts_vilaineSimple feature collection with 37 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -2.965245 ymin: 47.46923 xmax: -1.051178 ymax: 48.3493
Geodetic CRS: WGS 84
First 10 features:
pop_id sta_id altitude geometry
1 46885 12253 25 POINT (-2.429369 47.71654)
2 46166 11854 176 POINT (-2.965245 48.32017)
3 47601 12434 11 POINT (-1.796815 47.46923)
4 46671 12118 9 POINT (-2.332695 47.76529)
5 47659 12447 3 POINT (-2.142875 47.5878)
6 47613 12436 5 POINT (-1.867647 47.70129)
7 46484 12007 41 POINT (-2.35679 48.01553)
8 47460 12407 48 POINT (-1.410521 47.71718)
9 47677 12451 16 POINT (-2.563233 47.59471)
10 47326 12380 30 POINT (-1.562385 48.03841)
We see that the altitude is not measured on every sampling points, but only in 37 locations. Let’s plot them.
library(ggplot2)
set_theme(theme_minimal())
options(
ggplot2.continuous.colour = "viridis",
ggplot2.continuous.fill = "viridis"
)
ggplot() +
geom_sf(data = points_geo_vilaine, color = "grey80", alpha = 0.5) +
geom_sf(data = pts_vilaine, aes(color = altitude), size = 5) +
labs(
color = "Altitude (m)"
)We see in grey every location were we have sample of fish abundances, and in colour where we have altitude data. We have to interpolate altitude data for missing locations. The simplest way to do it is to the “nearest neighbour” interpolation.
2.3. Interpolation of altitude data
We first build proximity polygons with the terra::voronoi function.
library(terra)
d_elev <- select(pts_vilaine, c("geometry", "altitude"))
d_elev.vect <- terra::vect(d_elev)
v <- voronoi(d_elev.vect)
plot(v, "altitude")
points(d_elev.vect)v.sf <- st_as_sf(v)
pred <- st_intersection(v.sf, points_geo_vilaine)
d_elev.nn <- select(pred, c("altitude", "pop_id", "geometry"))
ggplot() +
geom_sf(data = pred, aes(color = altitude)) +
geom_sf(data = pts_vilaine, aes(color = altitude), size = 5) +
labs(
color = "Altitude (m)"
)Predicted altitudes seem consistent with the nearest neighbour interpolation.
2.4. Cross-validation of altitude interpolation
Now, that we have a prediction pipeline working we will estimate its performance using cross-validation. To do so, we will do a LOOCV (Leave-One-Out Cross-Validation).
n <- nrow(d_elev)
preds <- numeric(n) # Vector filled with 0s.
for (i in 1:n) {
train <- d_elev[-i, ]
test <- d_elev[i, ]
v_train <- voronoi(terra::vect(train))
preds[i] <- st_intersection(st_as_sf(v_train), test)$altitude
}
d_elev$altitude.loo <- preds
ggplot(d_elev, aes(x = altitude, y = altitude.loo)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
labs(
x = "Observed altitude (m)",
y = "Predicted altitude (m)"
)rmse <- sqrt(mean((d_elev$altitude - d_elev$altitude.loo)^2))
rmse[1] 18.74797
3. Abundance data
3.1. Load data
catches <- catches |>
rename(
species = esp_code_alternatif,
year = annee,
abundance = effectif
)
catches# A tibble: 69,823 × 6
sta_id pop_id ope_id year species abundance
<int> <int> <int> <dbl> <chr> <int>
1 8530 31870 19943 2017 TRF 159
2 8530 31870 19943 2017 VAI 82
3 8530 31870 19944 2011 TRF 217
4 8530 31870 19944 2011 VAI 7
5 8530 31870 19945 2009 TRF 232
6 8530 31870 19945 2009 VAI 9
7 8530 31870 19946 2007 TRF 213
8 8530 31870 19946 2007 VAI 36
9 8530 31870 83029 2019 TRF 126
10 8530 31870 83029 2019 VAI 104
# ℹ 69,813 more rows
species_names <- unique(catches$species)Next, let’s add zeros where species are not found.
catches <- catches |>
pivot_wider(
names_from = species,
values_from = abundance,
values_fill = list(abundance = 0)
) |>
pivot_longer(
cols = species_names,
names_to = "species",
values_to = "abundance"
)
catches# A tibble: 450,606 × 6
sta_id pop_id ope_id year species abundance
<int> <int> <int> <dbl> <chr> <int>
1 8530 31870 19943 2017 TRF 159
2 8530 31870 19943 2017 VAI 82
3 8530 31870 19943 2017 LOF 0
4 8530 31870 19943 2017 CHA 0
5 8530 31870 19943 2017 SDF 0
6 8530 31870 19943 2017 PFL 0
7 8530 31870 19943 2017 BAF 0
8 8530 31870 19943 2017 CHE 0
9 8530 31870 19943 2017 GOU 0
10 8530 31870 19943 2017 OBR 0
# ℹ 450,596 more rows
3.2. Preliminary plots
Let’s plot abundance histogram for each species. As most species abundance are dominated by zeros, because of absence data, we plot log(abundance + 1) instead of raw abundances. This also implies that we will need to use zero-inflated distributions in our statistical model to account for this absence data.
catches |>
ggplot(aes(x = abundance + 1)) + # add 1 to avoid log(0)
geom_histogram(binwidth = 0.5) +
scale_x_log10() +
facet_wrap(~species)Let’s plot abundance time series for the 10 most abundant species.
n_species <- 10
common_species <- catches |>
group_by(species) |>
summarise(mean_abundance = mean(abundance)) |>
arrange(desc(mean_abundance)) |>
slice(1:n_species) |>
pull(species)
catches |>
filter(species %in% common_species) |>
group_by(year, species) |>
summarise(mean_abundance = mean(abundance)) |>
ggplot(aes(x = year, y = mean_abundance, colour = species)) +
geom_line()There is no clear trend, but we can’t say much at this point because trend may be masked by sampling efforts and environmental covariates.
Here it would be interesting to plot the number of operations per year to see the evolution of the sampling effort.
d_eel <- catches |> filter(species == "ANG")
d_eel.avg <- d_eel |>
group_by(pop_id) |>
summarise(abundance_mean = mean(abundance))
data <- inner_join(d_elev.nn, d_eel.avg, by = "pop_id")
ggplot(data, aes(x = altitude, y = abundance_mean)) +
geom_point() +
labs(
x = "Altitude (m)",
y = "Eel abundance"
)We clearly observe the expected trend of eel abundance with altitude.
Next, we want to model the effect of altitude, while accounting for time using INLA.
4. Statistical models
Data is ready to model eel abundances in space, time and with altitude. In this section, we will build increasingly complex models.
Before doing anything fancy, let’s load INLA, and scale our predictor variable (altitude) as it is good practice.
library(INLA)
data <- inner_join(d_eel, d_elev.nn, by = "pop_id")
data$altitude_s <- scale(data$altitude)4.1. Abundance distribution and overdisperion
Here, we focus on the choice of the distribution of our target variable: eel abundance. We will focus particularly on overdispersion, as we know that eel distribution is zero-inflated due to absence data. For the following model, we add a random effect for stations and year. We will discuss in further details these points in following sections.
4.1.1. Poisson model
We begin with a poisson GLM.
m.poisson <- inla(
abundance ~ 1 + altitude_s +
f(year, model = "iid") +
f(pop_id, model = "iid"),
family = "poisson",
data = data,
control.compute = list(config = TRUE, waic = TRUE, dic = TRUE)
)
summary(m.poisson)Time used:
Pre = 2.3, Running = 0.447, Post = 0.129, Total = 2.88
Fixed effects:
mean sd 0.025quant 0.5quant 0.975quant mode kld
(Intercept) 1.921 0.165 1.594 1.922 2.244 1.922 0
altitude_s -1.404 0.169 -1.739 -1.403 -1.075 -1.403 0
Random effects:
Name Model
year IID model
pop_id IID model
Model hyperparameters:
mean sd 0.025quant 0.5quant 0.975quant mode
Precision for year 10.86 2.719 6.395 10.57 17.03 10.03
Precision for pop_id 1.26 0.331 0.723 1.22 2.02 1.15
Deviance Information Criterion (DIC) ...............: 4552.01
Deviance Information Criterion (DIC, saturated) ....: 2685.27
Effective number of parameters .....................: -737.88
Watanabe-Akaike information criterion (WAIC) ...: 10992.89
Effective number of parameters .................: 2810.03
Marginal log-Likelihood: -3209.39
is computed
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
We can plot the distribution of the fixed effect (altitude).
marginal.altitude <- m.poisson$marginals.fixed$altitude_s
ggplot(as.data.frame(marginal.altitude)) +
geom_line(aes(x = x, y = y)) +
labs(
x = "Altitude effect size",
y = "Probability"
)We can compute the highest posterior density (HDP) credible interval.
inla.hpdmarginal(0.97, m.poisson$marginals.fixed$altitude_s) low high
level:0.97 -1.774432 -1.037384
We can also sample from the posterior distribution.
m.poisson.samples <- inla.posterior.sample(100, m.poisson)We represent the sampled effect size of altitude against the marginal distribution obtained before.
fun <- function(...) {
altitude_s
}
altitude.sample <- inla.posterior.sample.eval(fun, m.poisson.samples)
tab <- data.frame(altitude = altitude.sample[1, ])
ggplot(tab, aes(x = altitude)) +
geom_histogram(aes(y = after_stat(density)), bins = 18) +
geom_line(data = as.data.frame(marginal.altitude), aes(x = x, y = y)) +
labs(
x = "Altitude effect size",
y = "Density"
)Next, we want to simulate data from the model. This step is important as it allows us to see if our model makes sense or not. In particular, we identify where our model fails and improve it.
alt_vals <- seq(-1, 3, length.out = 100)
y.sample <- inla.posterior.sample.eval(
fun = function(...) {
mu <- exp(Intercept + altitude_s * alt_vals)
abundance <- rpois(length(mu), mu)
},
samples = m.poisson.samples
)
library(bayestestR)
y.sum <- apply(y.sample, 1, function(x) {
y.hdi <- hdi(x, ci = 0.89)
c(mean = mean(x), hdi_low = y.hdi$CI_low, hdi_high = y.hdi$CI_high)
})
data.sim <- data.frame(
altitude_s = alt_vals,
t(y.sum)
)
ggplot(data.sim, aes(x = altitude_s, y = mean)) +
# geom_point(data = data, aes(x = altitude_s, y = abundance), colour = "grey80") +
geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
geom_line(colour = "steelblue") +
labs(
x = "Scaled altitude",
y = "Eel abundance"
)Next, we want to plot actual data against this prediction. Because we have not included spatial and temporal random effects while sampling the posterior, we are making predictions for an average site and year. Therefore, we want to average abundance data over site and year.
data.avg <- data |>
group_by(altitude_s) |>
summarise(
abundance_avg = mean(abundance),
abundance_var = var(abundance)
)
data.avg# A tibble: 32 × 3
altitude_s[,1] abundance_avg abundance_var
<dbl> <dbl> <dbl>
1 -0.979 13.0 52.4
2 -0.954 35.1 416.
3 -0.929 50.2 1692.
4 -0.904 30.3 235.
5 -0.830 43.1 2204.
6 -0.780 50.1 1004.
7 -0.705 37.6 350.
8 -0.680 42.4 624.
9 -0.655 26.6 332.
10 -0.630 9.44 15.3
# ℹ 22 more rows
ggplot(data.sim, aes(x = altitude_s, y = mean)) +
geom_point(data = data.avg, aes(x = altitude_s, y = abundance_avg), colour = "grey80") +
geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
geom_line(colour = "steelblue") +
labs(
x = "Scaled altitude",
y = "Eel abundance"
)We see that our model capture the clear decreasing trend of eel abundances with altitude.
Let’s investigate overdispersion with a posterior predictive check.
y_pred_avg <- apply(y.sample, 1, mean)
y_pred_var <- apply(y.sample, 1, var)
pred_df <- data.frame(
abundance_avg = y_pred_avg,
abundance_var = y_pred_var
)
pred_df$type <- "Posterior predictive"
pred_df$altitude_s <- alt_vals
data.avg$type <- "Observed"
plot_df <- rbind(pred_df, data.avg)
ggplot(plot_df, aes(x = abundance_avg, y = abundance_var, colour = type)) +
geom_point() +
geom_abline(slope = 1, intercept = 1) +
labs(
x = "Observed mean abundance",
y = "Observed variance in abundace"
)We see clearly that observed data is overdispersed (variance >> mean) compared to the data generated by our model. This suggest that the poisson distribution is not suited for this task.
4.1.2. Negative binomial model
To account for overdispersion, we will test the negative binomial distribution. Basically, we will repeat the previous section but with a model where poisson distribution is replaced with a negative binomial one.
m.nbinom <- inla(
abundance ~ 1 + altitude_s +
f(year, model = "iid") +
f(pop_id, model = "iid"),
family = "nbinomial",
data = data,
control.compute = list(config = TRUE, waic = TRUE, dic = TRUE)
)
summary(m.nbinom)Time used:
Pre = 0.859, Running = 0.553, Post = 0.114, Total = 1.53
Fixed effects:
mean sd 0.025quant 0.5quant 0.975quant mode kld
(Intercept) 2.036 0.158 1.721 2.037 2.344 2.037 0
altitude_s -1.412 0.164 -1.737 -1.412 -1.092 -1.412 0
Random effects:
Name Model
year IID model
pop_id IID model
Model hyperparameters:
mean sd 0.025quant
size for the nbinomial observations (1/overdispersion) 2.18 0.185 1.837
Precision for year 18.59 8.697 7.425
Precision for pop_id 1.46 0.411 0.806
0.5quant 0.975quant
size for the nbinomial observations (1/overdispersion) 2.17 2.56
Precision for year 16.73 40.79
Precision for pop_id 1.41 2.41
mode
size for the nbinomial observations (1/overdispersion) 2.16
Precision for year 13.63
Precision for pop_id 1.31
Deviance Information Criterion (DIC) ...............: 3440.05
Deviance Information Criterion (DIC, saturated) ....: 622.11
Effective number of parameters .....................: 53.88
Watanabe-Akaike information criterion (WAIC) ...: 3443.63
Effective number of parameters .................: 50.77
Marginal log-Likelihood: -1783.55
is computed
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
Let’s plot the model prediction against actual data.
m.nbinom.samples <- inla.posterior.sample(1000, m.nbinom)
alt_vals <- seq(-1, 3, length.out = 100)
y.sample <- inla.posterior.sample.eval(
fun = function(...) {
mu <- exp(Intercept + altitude_s * alt_vals)
abundance <- rnbinom(length(mu), mu = mu, size = theta[1])
},
samples = m.nbinom.samples
)
y.sum <- apply(y.sample, 1, function(x) {
y.hdi <- hdi(x, ci = 0.89)
c(mean = mean(x), hdi_low = y.hdi$CI_low, hdi_high = y.hdi$CI_high)
})
data.sim <- data.frame(
altitude_s = alt_vals,
t(y.sum)
)
ggplot(data.sim, aes(x = altitude_s, y = mean)) +
geom_point(data = data.avg, aes(x = altitude_s, y = abundance_avg), colour = "grey80") +
geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
geom_line(colour = "steelblue") +
labs(
x = "Scaled altitude",
y = "Eel abundance"
)We see that the negative binomial distribution better account for the variance. We will stick to this distribution for now, although we could have tried other distributions. Notably, the ZIP-poisson could be useful when we have many 0s.
4.2. Modeling time
Let’s now focus on modelling the effect of time. To begin with, let’s inspect the raw data.
ggplot(data, aes(x = year, y = abundance, color = altitude_s)) +
geom_jitter(width = 0.2) +
stat_summary(fun = mean, geom = "line", colour = "grey", size = 1) +
labs(x = "Year", ylab = "Eel abundance", color = "Altitude (scaled)")The plain line indicates the trend of the mean abundance over all sites. This trend is decreasing, we expect to retrieve this trend in our temporal random effects.
4.2.1. IID random effect
We begin by assuming that year random effects are independent from one another. We expect this assumption to be wrong, but let’s start from there and then explore more realistic modeling choices.
m.iid <- inla(
abundance ~ 1 + altitude_s +
f(year, model = "iid") +
f(pop_id, model = "iid"),
family = "nbinomial",
data = data,
control.compute = list(config = TRUE, waic = TRUE)
)
re.year.iid <- m.iid$summary.random$yearHere was our first try, yet we expect some correlation from one year to another. For this reason, time random effect are often modelled with autoregressive model (AR1 if lag 1).
4.2.2. AR1 Random effect
We modify the previous model changing the year random effect from IID to AR1.
m.ar <- inla(
abundance ~ 1 + altitude_s +
f(year, model = "ar1") +
f(pop_id, model = "iid"),
family = "nbinomial",
data = data,
control.compute = list(config = TRUE, waic = TRUE)
)
re.year.ar <- m.ar$summary.random$year4.2.3. RW1 random effect
Same with RW1 random effect.
m.rw <- inla(
abundance ~ 1 + altitude_s +
f(year, model = "rw1") +
f(pop_id, model = "iid"),
family = "nbinomial",
data = data,
control.compute = list(config = TRUE, waic = TRUE)
)
re.year.rw <- m.rw$summary.random$year4.2.4. Model comparisons
Now that we have the three models, we can compare them.
First, we can plot random effect vs year.
re.year.iid$type <- "iid"
re.year.ar$type <- "ar"
re.year.rw$type <- "rw"
re.year.all <- rbind(re.year.iid, re.year.ar, re.year.rw)
ggplot(re.year.all, aes(x = ID, y = mean, color = type)) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
geom_ribbon(aes(ymin = `0.025quant`, ymax = `0.975quant`), alpha = 0.1) +
geom_line() +
geom_point() +
labs(
x = "Year",
y = "Year random effect",
)We see that AR1 and RW1 produce random effects with a smooth trend. By contrast, IID random effects produces ‘chaotic’ random effects due to their independence. Second, we can inspect model WAIC, where lower values implies better out-of-sample predictive performance.
m.iid$waic$waic[1] 3443.63
m.ar$waic$waic[1] 3428.841
m.rw$waic$waic[1] 3428.494
We see that AR1 and RW1 have lower WAIC signaling better predictive ability, as we could have expected.
We can also do a predictive check by plotting predicted trends against the observed one.
pred_iid <- tapply(m.iid$summary.fitted.values$mean, data$year, mean)
pred_ar1 <- tapply(m.ar$summary.fitted.values$mean, data$year, mean)
pred_rw1 <- tapply(m.rw$summary.fitted.values$mean, data$year, mean)
obs <- tapply(data$abundance, data$year, mean)
df <- data.frame(
year = sort(unique(data$year)),
obs = obs,
iid = pred_iid,
ar1 = pred_ar1,
rw1 = pred_rw1
)
df <- df |> pivot_longer(cols = c("obs", "iid", "ar1", "rw1"), names_to = "type", values_to = "mean_abundance")
ggplot(df, aes(x = year, y = mean_abundance, color = type)) +
geom_line() +
labs(
y = "Mean abundance",
x = "Year"
)Here, we see that there is no strong differences: all three models track well the observed trend. The IID model might be a bit overfitting the trend at the beginning, but otherwise there is not much to say.